/*
CLEANING SOKATPER

Data 	: DS_Lev_Sokatper.dta
Folder 	: ../../rawdata/dtafiles/DS_Lev_Sokatper.dta
Date	: 2017-02-13

Creator		: Jonas Cederlof	(JC)
Description : 
Notes:

LATEST UPDATE : 2017-03-22
*/

********************************************************************************
* Session setup: switch off the -more- pager and close any log left open.
set more off
capture log close _all

* Write the output of this run to an SMCL log, overwriting the previous one.
log using "../log/A1_clean_sokatper.smcl", replace

* Load the raw file from the folder held in the global macro rawdatapath,
* discarding whatever data are currently in memory.
use "$rawdatapath/DS_Lev_Sokatper.dta", clear




{ // Basic data preparation
*******************************************************************************

// Convert the string date variables to Stata daily dates
*******************************************************************************
* For each date variable, characters 1-4, 6-7 and 9-10 of the string are read
* as year, month and day (presumably a YYYY-MM-DD layout -- verify against the
* raw file). The string variable is then replaced by a %td daily date that
* carries the same name.
foreach datevar of varlist inska_dat utska_dat {
	tempvar yy mm dd daily

	generate 	`yy' 	= substr(`datevar', 1, 4)
	generate 	`mm' 	= substr(`datevar', 6, 2)
	generate 	`dd' 	= substr(`datevar', 9, 2)
	destring 	`yy' `mm' `dd', replace

	generate 	`daily' = mdy(`mm', `dd', `yy')
	format 		`daily' %td

	* Swap the numeric date in under the original variable name
	drop 		`datevar' `yy' `mm' `dd'
	rename 		`daily' `datevar'
}


// Rename variables
********************************************************************************
rename lopnr persid


// Label variables
*******************************************************************************
label variable persid 		"ID: individual"
label variable inlnr 		"ID: unemployment spell"
label variable sokatlnr 	"ID: search category (within inlnr)"
label variable skat 		"Search category"
label variable inska_dat 	"Timestamp: begining of search category"
label variable utska_dat 	"Timestamp: ending of search category"
label variable antdgr 		"Number of days between inska_dat and utska_dat"

}
*

{ // First look at the data
*******************************************************************************
* The dates are not very coherent. In reality inska_dat<utska_dat, but in about
* 50% of the cases this does not hold. This is, however, alleviated when
* collapsing the data within spells below.

*sum
*misstable sum
}
*


* Drop every record whose inska_dat lies before 1 January 2000.
* NOTE(review): at this point the rows have not yet been collapsed to spells,
* so a spell with records on both sides of the cutoff is trimmed rather than
* removed entirely -- confirm that this is intended.
drop if inska_dat < td(01jan2000)

*save "../data/sokattemp.dta", replace

{ // Calculating number of days unemployed during a year
*******************************************************************************
* As PES defines spells using inlnr, I collapse the data by persid and inlnr,
* taking the first (last) date as the start (end) of the spell.
* Order the records chronologically within individual.
sort persid inska_dat utska_dat

* Top-code open-ended records: a missing utska_dat is set to the latest
* utska_dat observed anywhere in the data.
summarize utska_dat, format
local last_observed = r(max)
replace utska_dat = `last_observed' if utska_dat == .
*destring 	 skat, replace


* Disabled: take the mode of the search category within each spell
*bys persid inlnr : egen skatmode = mode(skat) , maxmode

* One row per spell: earliest start and latest end within persid and inlnr.
* Since deregistration renders a new inlnr, the individual is registered at
* PES throughout an inlnr.
collapse (min) inska_dat (max) utska_dat, by(persid inlnr)


* Keep only time-consistent spells, i.e. those ending strictly after they start.
keep if utska_dat > inska_dat


{ // Fixing overlapping spells
*===============================================================================
* Within a persid, some spells are registered to end after a later spell has
* already begun. Such spells are merged into one spell that runs from the
* earliest start date to the latest end date of the overlapping set.
*
* The merge uses a running maximum of end dates, so that
*   - a spell lying entirely inside an earlier one never shortens it, and
*   - chains of overlaps (A overlaps B, B overlaps C) are merged in full.
* A merged spell keeps the inlnr of its first (earliest starting) row.

* Most overlapping spells share inska_dat but differ in utska_dat, so first
* collapse by start date and take the highest end date.
collapse (min) inlnr (max) utska_dat, by(persid inska_dat)

* Latest end date seen so far within the individual. -replace- runs row by row,
* so latest_end[_n-1] already holds the updated running maximum.
bys persid (inska_dat) : gen double latest_end = utska_dat
by  persid (inska_dat) : replace latest_end = max(latest_end[_n-1], utska_dat) if _n > 1

* A row overlaps when it starts strictly before everything earlier has ended.
* The explicit non-missing check is needed because missing compares as larger
* than any date.
by  persid (inska_dat) : gen byte overlaps = _n > 1 & inska_dat < latest_end[_n-1] & latest_end[_n-1] < .

* Every non-overlapping row opens a new merged spell.
by  persid (inska_dat) : gen long spellgrp = sum(!overlaps)

* Diagnostic: share of rows belonging to individuals with an overlapping spell
egen byte invalidspell = max(overlaps), by(persid)
sum invalidspell

* Give every row the latest end date of its merged spell, then keep only the
* first row of each merged spell (earliest start date and its inlnr).
egen double merged_end = max(utska_dat), by(persid spellgrp)
replace utska_dat = merged_end
bys persid spellgrp (inska_dat) : keep if _n == 1

drop latest_end overlaps spellgrp merged_end
}
*Note: There should now exist no overlapping spells 


* Expand each spell to one row per calendar month it touches and compute the
* number of unemployment days falling in that month.
* Output variables: persid, date (%tm month), unemp_startdate, unemp_enddate
* and daysunemp.

*Monthly dates of the spell start and end
gen inska_month = mofd(inska_dat)
gen utska_month = mofd(utska_dat)
format inska_month utska_month %tm


*Expanding data monthly by persid and inlnr (one copy per month of the spell)
gen duration = utska_month - inska_month + 1
expand duration

*Generating a time (month/year) variable for each observation
bys persid inlnr (inska_dat) : gen month = inska_month + _n - 1
format month %tm

*Start date within the month: inska_dat in the first month of the spell,
*otherwise the first day of the month
gen startdate = cond(inska_month == month, inska_dat, dofm(month))

*End date within the month: utska_dat in the last month of the spell, otherwise
*the 30th. For months without a 30th (February) the actual last day is used,
*which is the 29th in leap years. min() ignores the missing value that mdy()
*returns for a non-existent 30 February.
gen monthend = dofm(month + 1) - 1
gen day30    = mdy(month(dofm(month)), 30, year(dofm(month)))
gen enddate  = cond(utska_month == month, utska_dat, min(monthend, day30))

*A spell that starts on the 31st and continues into the next month would end
*(30th) before it starts; count that month as the single day it covers.
replace enddate = startdate if enddate < startdate

*Format the date variables
format startdate enddate %td

*Rename
rename startdate	unemp_startdate
rename enddate		unemp_enddate
rename month 		date

*Generate days of unemployment per month (both end points included)
gen daysunemp = unemp_enddate - unemp_startdate + 1
keep persid unemp_startdate unemp_enddate daysunemp date


* An individual can have several rows for one month when a spell ends and a
* new one starts within that month. Tag and tabulate these for the log.
duplicates tag persid date, generate(dup)
tabulate dup
*br if dup>0
*br if persid==325509

* One row per individual and month: total days, earliest start, latest end.
collapse (sum) daysunemp (min) unemp_startdate (max) unemp_enddate, by(persid date)

* Cap the monthly total at 30 days.
replace daysunemp = min(daysunemp, 30)

* Store the monthly unemployment data; the calendar year is added so the
* file can be aggregated by year.
compress
generate year = year(dofm(date))
save "$datapath/A1_sokatper_by_month.dta", replace





* Annual unemployment days: sum of the monthly days by individual and year.
* gcollapse comes from the user-written gtools package; when it is not
* installed, fall back to the built-in collapse instead of aborting at the
* very last step of the script.
capture which gcollapse
if _rc {
	collapse (sum) daysunemp, by(persid year)
}
else {
	gcollapse (sum) daysunemp, by(persid year)
}

rename daysunemp annual_daysunemp
compress
save "$datapath/A1_sokatper_by_year.dta",replace



}
*

* Close the log opened with -log using- at the top of this do-file.
log close





